S-FFSD.csv¶

InĀ [1]:
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # For easier feature plotting
import re

# Notebook-wide matplotlib defaults, applied in a single update call.
plt.rcParams.update({
    'figure.dpi': 300,                       # render figures at high resolution
    'font.sans-serif': ['Microsoft YaHei'],  # font able to draw Chinese characters
    'axes.unicode_minus': False,             # keep the minus sign displayable with that font
})

# Load the S-FFSD transaction table; stop the run if the file is missing.
try:
    df = pd.read_csv('./data/S-FFSD.csv')
except FileNotFoundError:
    print("Error: S-FFSD.csv not found. Make sure the file is in './data/'.")
    # Bare exit() is the interactive-shell helper from `site` and reports
    # success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)
else:
    print("S-FFSD.csv loaded successfully!")

# Numeric part of each Source/Target identifier: the first run of digits.
# Vectorised extraction replaces one Python-level regex call per row.
df['Source_Num'] = df['Source'].str.extract(r'(\d+)', expand=False).astype('int64')
df['Target_Num'] = df['Target'].str.extract(r'(\d+)', expand=False).astype('int64')

# Share of each transaction type relative to the total number of rows.
type_counts = df['Type'].value_counts()
total_count = len(df)
type_percentages = type_counts / total_count

# Merge every type that covers less than 5% of the rows into 'other'.
# Using isin/where leaves missing Type values untouched instead of failing
# on a lookup that value_counts() never produced.
rare_types = type_percentages.index[type_percentages < 0.05]
df['Type'] = df['Type'].where(~df['Type'].isin(rare_types), 'other')

# 2x2 overview figure: Source/Target id histograms on the top row,
# Amount-quantile and Type pie charts on the bottom row.
# The Chinese titles/labels below were mis-encoded (UTF-8 read through a
# single-byte code page) and are restored to readable text here.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# Histograms of the numeric parts of the Source and Target identifiers.
for ax, column, prefix in (
    (axes[0, 0], 'Source_Num', 'Source'),
    (axes[0, 1], 'Target_Num', 'Target'),
):
    sns.histplot(df[column], bins=20, kde=False, ax=ax)
    ax.set_title(f'{prefix}数值部分的分布直方图-刘锔圆', fontsize=16)
    ax.set_xlabel('数值', fontsize=12)
    ax.set_ylabel('频次', fontsize=12)

# Amount split into (up to) five quantile bins; duplicates='drop' keeps
# qcut from raising when repeated amounts make two bin edges coincide.
amount_bins = pd.qcut(df['Amount'], q=5, duplicates='drop')
amount_bin_counts = amount_bins.value_counts()
amount_bin_counts.plot(kind='pie', autopct='%1.1f%%', ax=axes[1, 0])
axes[1, 0].set_title('Amount的分布饼图-刘锔圆')
axes[1, 0].set_ylabel('')  # hide the default series-name label

# Share of each transaction type as currently stored in df['Type'].
type_counts_updated = df['Type'].value_counts()
type_counts_updated.plot(kind='pie', autopct='%1.1f%%', ax=axes[1, 1])
axes[1, 1].set_title('Type的分布饼图-刘锔圆')
axes[1, 1].set_ylabel('')

# Keep the large titles from overlapping the neighbouring subplots.
fig.tight_layout()

# --- Pie (donut) chart for 'Labels' ---
plt.figure(figsize=(8, 8))

# Map the numeric label codes to readable names
# (0 = non-fraud, 1 = fraud, 2 = unlabeled).  The Chinese strings were
# mis-encoded in the original and are restored here.  Codes outside this
# mapping become NaN and are therefore left out of the counts below.
labels_for_plot = df['Labels'].map({0: '非欺诈', 1: '欺诈', 2: '未标注'})
labels_counts = labels_for_plot.value_counts()

# wedgeprops width < 1 hollows out the centre, turning the pie into a donut.
labels_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=sns.color_palette("pastel"),
                   pctdistance=0.85, wedgeprops=dict(width=0.4))
plt.title('Labels的分布饼图-刘锔圆', fontsize=16)
plt.tight_layout()
plt.show()  # Renders every figure that is still open, not only this one
S-FFSD.csv loaded successfully!
No description has been provided for this image
No description has been provided for this image

YelpChi.mat¶

三种边类型(关系矩阵): R-U-R(同一用户发布的评论) R-S-R(具有相同星级评分的同一产品下的评论) R-T-R(在同一产品下同一月份发布的评论) (横纵轴是评论) homo是三者的合并矩阵

features是特征矩阵,label是0或1表示欺诈与否

InĀ [2]:
# Font settings so Chinese text and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the .mat file; stop the run if it is missing.
try:
    yelp_data = scipy.io.loadmat('./data/YelpChi.mat')
except FileNotFoundError:
    print("Error: YelpChi.mat not found.")
    # Bare exit() is the interactive-shell helper from `site` and reports
    # success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)
else:
    print("Data loaded successfully!")
    print("Available keys:", yelp_data.keys())

# Pull the individual arrays out of the loaded dictionary.
homo_matrix = yelp_data['homo']
net_rur_matrix = yelp_data['net_rur']
net_rsr_matrix = yelp_data['net_rsr']
net_rtr_matrix = yelp_data['net_rtr']
features_matrix = yelp_data['features']
labels = yelp_data['label'].flatten()  # Ensure labels are a 1D array
Data loaded successfully!
Available keys: dict_keys(['__header__', '__version__', '__globals__', 'homo', 'net_rur', 'net_rtr', 'net_rsr', 'features', 'label'])

关系矩阵的热力图-YelpChi.mat¶

InĀ [3]:
def plot_relationship_heatmap(matrix, title, max_size=100):
    """Show a heatmap of the top-left ``max_size`` x ``max_size`` corner of ``matrix``.

    Inputs that expose ``toarray`` (e.g. SciPy sparse matrices) are sliced
    first, so only the small corner is converted to a dense array.  The
    ``'binary'`` colormap is used when the corner holds nothing but 0s and
    1s, ``'viridis'`` otherwise.

    Args:
        matrix: 2-D matrix supporting ``[:k, :k]`` slicing.
        title: Text placed in front of the size/author suffix in the title.
        max_size: Number of leading rows and columns to display.
    """
    corner = matrix[:max_size, :max_size]
    if hasattr(matrix, "toarray"):
        corner = corner.toarray()

    # Pick the colormap from the values that will actually be drawn.
    only_zeros_and_ones = np.all(np.isin(corner, [0, 1]))
    if only_zeros_and_ones:
        colormap = 'binary'
    else:
        colormap = 'viridis'

    caption = title + f" (Top-left {max_size}x{max_size})-刘锔圆"

    plt.figure(figsize=(9, 8))
    sns.heatmap(corner, cmap=colormap, cbar_kws={'label': 'Connection (1) / No Connection (0)'})
    plt.title(caption, fontsize=16)
    plt.xlabel("Review Index", fontsize=12)
    plt.ylabel("Review Index", fontsize=12)
    plt.tight_layout()
    plt.show()

# One heatmap per YelpChi relation matrix, followed by the combined one.
for relation_matrix, relation_title in (
    (net_rur_matrix, 'R-U-R Matrix (Same User Reviews)'),
    (net_rsr_matrix, 'R-S-R Matrix (Same Star Rating, Same Product)'),
    (net_rtr_matrix, 'R-T-R Matrix (Same Month, Same Product)'),
    (homo_matrix, 'Homogeneous Matrix (Combined Relationships)'),
):
    plot_relationship_heatmap(relation_matrix, relation_title)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

直方图的形状-YelpChi.mat¶

会告诉您大多数评论与其他评论的连接数量是多少。例如,如果直方图集中在较低的连接数,表示大多数评论只与少数其他评论有关系。 高连接评论的比例: 如果直方图的尾部较长或存在多个峰值,可能表示有一部分评论与其他评论有着异常多的连接。在欺诈检测中,一个评论与非常多的其他评论(尤其是通过同一用户、相同星级或相同时间发布)有关系,可能是欺诈团伙活动的迹象。 不同关系类型的特征: 比较 R-U-R、R-S-R、R-T-R 和 Homogeneous 矩阵的连接数直方图,您可以看出不同类型的关系在评论网络中的密度和分布特征。例如,R-U-R 可能显示某些用户发布了大量评论,导致这些评论之间有很高的连接数。

InĀ [4]:
def plot_connection_counts_histogram(matrix, title):
    """Plot a histogram of the per-row sums ("connection counts") of ``matrix``.

    Args:
        matrix: 2-D adjacency-style matrix; either a dense NumPy array or an
            object with a NumPy-compatible ``sum(axis=1)`` such as a SciPy
            sparse matrix.
        title: Name of the matrix, inserted into the plot title.
    """
    # Sum the rows on the matrix as given.  Converting a sparse N x N
    # adjacency to dense first would allocate N*N cells only to reduce them
    # straight away.  Sparse matrices return an (N, 1) result, so flatten to
    # one dimension; for dense arrays the ravel is a no-op.
    connection_counts = np.asarray(matrix.sum(axis=1)).ravel()

    plt.figure(figsize=(10, 6))
    sns.histplot(connection_counts, bins=30, kde=True, color='skyblue')
    plt.title(f'Histogram of Connection Counts in {title}-刘锔圆', fontsize=16)
    plt.xlabel("Number of Connections per Review", fontsize=12)
    plt.ylabel("Number of Reviews", fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
# Connection-count histogram for each YelpChi relation and the combined matrix.
for relation_matrix, relation_name in (
    (net_rur_matrix, 'R-U-R Matrix'),
    (net_rsr_matrix, 'R-S-R Matrix'),
    (net_rtr_matrix, 'R-T-R Matrix'),
    (homo_matrix, 'Homogeneous Matrix'),
):
    plot_connection_counts_histogram(relation_matrix, relation_name)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

降维特征数据-YelpChi.mat¶

高维特征数据投影到二维空间,并可视化这些降维后的数据点,同时根据评论的欺诈标签(labels)进行着色。

这样做是为了在无法直接可视化多于3维数据时,帮助我们观察欺诈评论和非欺诈评论在特征空间中是否存在可分离的模式或聚类。

如果不同类别的点在降维后的空间中能明显地聚成不同的簇,那就表明这些特征对于区分欺诈和非欺诈行为是有效的。

InĀ [5]:
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

print("\n--- Visualizing Features with Dimensionality Reduction ---")

# --- Linear 2-D projection with TruncatedSVD, coloured by fraud label ---
if features_matrix.shape[1] > 2:
    svd = TruncatedSVD(n_components=2, random_state=42)
    features_svd = svd.fit_transform(features_matrix)

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_svd[:, 0], y=features_svd[:, 1], hue=labels, palette='coolwarm', alpha=0.7, s=30)
    plt.title('TruncatedSVD of Features, Colored by Fraud Label-刘锔圆', fontsize=16)
    plt.xlabel(f'SVD Component 1 ({svd.explained_variance_ratio_[0]*100:.2f}% variance)', fontsize=12)
    plt.ylabel(f'SVD Component 2 ({svd.explained_variance_ratio_[1]*100:.2f}% variance)', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough features for SVD (less than 2 dimensions).")

# --- Non-linear 2-D embedding with t-SNE on a random subset of rows ---
if features_matrix.shape[0] > 1000 and features_matrix.shape[1] > 1:
    print("\nDataset is large. t-SNE can be slow. Running on a sampled subset.")
    sample_size = 5000  # You can adjust this value (e.g., 2000, 3000, 4000, 5000)

    if features_matrix.shape[0] > sample_size:
        # Draw the subset from a seeded local generator: TSNE's random_state
        # alone does not make the plot reproducible if the sampled rows
        # change on every run.  A local Generator also leaves NumPy's global
        # random state untouched.
        rng = np.random.default_rng(42)
        sample_indices = rng.choice(features_matrix.shape[0], sample_size, replace=False)
        features_sample = features_matrix[sample_indices]
        labels_sample = labels[sample_indices]
    else:
        features_sample = features_matrix
        labels_sample = labels

    print(f"Applying t-SNE on a sample of {features_sample.shape[0]} reviews...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_jobs=-1)  # Adjust perplexity (often 5-50)

    # Densify only when the features are stored sparsely; calling .toarray()
    # unconditionally fails on a plain ndarray.
    if hasattr(features_sample, "toarray"):
        features_sample_dense = features_sample.toarray()
    else:
        features_sample_dense = np.asarray(features_sample)
    features_tsne = tsne.fit_transform(features_sample_dense)

    current_labels = labels_sample

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_tsne[:, 0], y=features_tsne[:, 1], hue=current_labels, palette='coolwarm', alpha=0.7, s=30)
    plt.title('t-SNE of Features, Colored by Fraud Label-刘锔圆', fontsize=16)
    plt.xlabel('t-SNE Component 1', fontsize=12)
    plt.ylabel('t-SNE Component 2', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

else:
    print("Not enough data points or features for meaningful t-SNE visualization.")
--- Visualizing Features with Dimensionality Reduction ---
No description has been provided for this image
Dataset is large. t-SNE can be slow. Running on a sampled subset.
Applying t-SNE on a sample of 5000 reviews...
No description has been provided for this image

Amazon.mat¶

InĀ [6]:
# Load the Amazon .mat file; stop the run if it is missing.
try:
    amazon_data = scipy.io.loadmat('./data/Amazon.mat')
except FileNotFoundError:
    print("Error: Amazon.mat not found. Make sure the file is in './data/'.")
    # Bare exit() is the interactive-shell helper from `site` and reports
    # success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)
else:
    print("\nAmazon Data loaded successfully!")
    print("Available keys for Amazon:", amazon_data.keys())

# Pull the individual arrays out of the loaded dictionary.
homo_matrix_amazon = amazon_data['homo']
net_upu_matrix_amazon = amazon_data['net_upu']
net_usu_matrix_amazon = amazon_data['net_usu']
net_uvu_matrix_amazon = amazon_data['net_uvu']
features_matrix_amazon = amazon_data['features']
labels_amazon = amazon_data['label'].flatten()  # flatten the label array to 1-D
Amazon Data loaded successfully!
Available keys for Amazon: dict_keys(['__header__', '__version__', '__globals__', 'homo', 'net_upu', 'net_usu', 'net_uvu', 'features', 'label'])
InĀ [7]:
print("\n--- Visualizing Relationship Matrices (Amazon.mat) ---")

# One heatmap per Amazon relation matrix, followed by the combined one.
for relation_matrix, relation_title in (
    (net_upu_matrix_amazon, 'U-P-U Matrix (Same Product, Same User)'),
    (net_usu_matrix_amazon, 'U-S-U Matrix (Same Product, Same Star Rating)'),
    (net_uvu_matrix_amazon, 'U-V-U Matrix (Same Product, Same Time)'),
    (homo_matrix_amazon, 'Homogeneous Matrix (Combined Relationships)'),
):
    plot_relationship_heatmap(relation_matrix, relation_title)
--- Visualizing Relationship Matrices (Amazon.mat) ---
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

直方图的形状-Amazon.mat¶

InĀ [8]:
# Connection-count histogram for each Amazon relation and the combined matrix.
for relation_matrix, relation_name in (
    (net_upu_matrix_amazon, 'U-P-U Matrix (Amazon)'),
    (net_usu_matrix_amazon, 'U-S-U Matrix (Amazon)'),
    (net_uvu_matrix_amazon, 'U-V-U Matrix (Amazon)'),
    (homo_matrix_amazon, 'Homogeneous Matrix (Amazon)'),
):
    plot_connection_counts_histogram(relation_matrix, relation_name)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

降维特征数据-Amazon.mat¶

InĀ [9]:
# --- Linear 2-D projection with TruncatedSVD, coloured by fraud label ---
if features_matrix_amazon.shape[1] > 2:
    svd = TruncatedSVD(n_components=2, random_state=42)
    features_svd_amazon = svd.fit_transform(features_matrix_amazon)

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_svd_amazon[:, 0], y=features_svd_amazon[:, 1], hue=labels_amazon, palette='coolwarm', alpha=0.7, s=30)
    plt.title('TruncatedSVD of Features, Colored by Fraud Label (Amazon)-刘锔圆', fontsize=16)
    plt.xlabel(f'SVD Component 1 ({svd.explained_variance_ratio_[0]*100:.2f}% variance)', fontsize=12)
    plt.ylabel(f'SVD Component 2 ({svd.explained_variance_ratio_[1]*100:.2f}% variance)', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough features for SVD (less than 2 dimensions) for Amazon.")

# --- Non-linear 2-D embedding with t-SNE on a random subset of rows ---
if features_matrix_amazon.shape[0] > 1000 and features_matrix_amazon.shape[1] > 1:
    print("\nDataset (Amazon) is large. t-SNE can be slow. Running on a sampled subset.")

    # Fix the global NumPy seed so the same rows are sampled on every run.
    np.random.seed(42)
    sample_size_amazon = 5000

    if features_matrix_amazon.shape[0] > sample_size_amazon:
        sample_indices_amazon = np.random.choice(features_matrix_amazon.shape[0], sample_size_amazon, replace=False)
        features_sample_amazon = features_matrix_amazon[sample_indices_amazon]
        labels_sample_amazon = labels_amazon[sample_indices_amazon]
    else:
        features_sample_amazon = features_matrix_amazon
        labels_sample_amazon = labels_amazon

    print(f"Applying t-SNE on a sample of {features_sample_amazon.shape[0]} reviews from Amazon...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_jobs=-1)

    # Densify only when the features are stored sparsely; calling .toarray()
    # unconditionally fails on a plain ndarray.
    if hasattr(features_sample_amazon, "toarray"):
        features_sample_dense_amazon = features_sample_amazon.toarray()
    else:
        features_sample_dense_amazon = np.asarray(features_sample_amazon)
    features_tsne_amazon = tsne.fit_transform(features_sample_dense_amazon)

    current_labels_amazon = labels_sample_amazon

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_tsne_amazon[:, 0], y=features_tsne_amazon[:, 1], hue=current_labels_amazon, palette='coolwarm', alpha=0.7, s=30)
    plt.title('t-SNE of Features, Colored by Fraud Label (Amazon Sampled)-刘锔圆', fontsize=16)
    plt.xlabel('t-SNE Component 1', fontsize=12)
    plt.ylabel('t-SNE Component 2', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough data points or features for meaningful t-SNE visualization for Amazon.")
No description has been provided for this image
Dataset (Amazon) is large. t-SNE can be slow. Running on a sampled subset.
Applying t-SNE on a sample of 5000 reviews from Amazon...
No description has been provided for this image
InĀ [10]:
# IPython shell escape: run nbconvert to export DataAnalysis.ipynb as HTML.
!jupyter nbconvert --to html DataAnalysis.ipynb